In [19]:
import os
import pathlib
import pickle

import earthaccess
import earthpy as et
import earthpy.earthexplorer as etee
import geopandas as gpd
import geoviews as gv
import gitpass
import glob
import holoviews as hv
import hvplot.pandas
import hvplot.xarray
import numpy as np
import pandas as pd
import rasterio
import requests
import rioxarray as rxr
import rioxarray.merge as rxrm
import shapely
import xarray as xr
import xrspatial
from sklearn.cluster import KMeans

# Root data directory used throughout the notebook (earthpy's default,
# e.g. ~/earth-analytics/data)
data_dir = os.path.join(et.io.HOME, et.io.DATA_NAME)

# Retry transient HTTP failures when GDAL reads remote rasters
os.environ["GDAL_HTTP_MAX_RETRY"] = "5"
os.environ["GDAL_HTTP_RETRY_DELAY"] = "1"
In [2]:
# Caching decorator

def cached(key, override=False):
    """Decorator factory that pickles a function's result for reuse.

    Parameters
    ==========
    key : str
      Base name of the pickle file stored under <data_dir>/jars/.
    override : bool
      If True, recompute the result and overwrite any existing cache.

    Returns
    =======
    function
      A decorator that wraps a compute function with load-from-cache logic.
    """
    def compute_and_cache_decorator(compute_function):
        """Wrap compute_function with pickle-based caching."""
        from functools import wraps

        # BUG FIX: preserve the wrapped function's name/docstring so
        # help() and debugging show the real function, not the wrapper.
        @wraps(compute_function)
        def compute_and_cache(*args, **kwargs):
            """Perform a computation and cache, or load cached result"""
            filename = os.path.join(
                et.io.HOME, et.io.DATA_NAME, 'jars', f'{key}.pickle')

            # Check if the cache exists already or override caching
            if not os.path.exists(filename) or override:
                # Make jars directory if needed
                os.makedirs(os.path.dirname(filename), exist_ok=True)

                # Run the compute function as the user did
                result = compute_function(*args, **kwargs)

                # Pickle the object so later runs can skip the computation.
                # NOTE(review): pickle.load below executes arbitrary code if
                # a jar file is tampered with -- only use trusted caches.
                with open(filename, 'wb') as file:
                    pickle.dump(result, file)
            else:
                # Unpickle the previously cached object
                with open(filename, 'rb') as file:
                    result = pickle.load(file)

            return result

        return compute_and_cache

    return compute_and_cache_decorator
In [7]:
#Download WBD spatial data

# Local path the shapefile will occupy after the zip is extracted
wbd_path = os.path.join(
    data_dir,
    'earthpy-downloads',
    'WBD_18_HU2_Shape',
    'Shape',
    'WBDHU10.shp'
)

# USGS staged-products URL for the Watershed Boundary Dataset, HU2 region 18
wbd_url = ("https://prd-tnm.s3.amazonaws.com/StagedProducts/"
           "Hydrography/WBD/HU2/Shape/WBD_18_HU2_Shape.zip"
          )

# Only download the archive if the shapefile is not already on disk
if not os.path.exists(wbd_path):
    print('downloading ' + wbd_url)  
    wbd_zip = et.data.get_data(url=wbd_url)
else:
    print(wbd_path + " already exists")

# Create a GDF and plot the HUC    
    
# NOTE(review): the plot title says 'HUC 180901020204' (12 digits) but the
# filter selects huc10 == '1809010202' (10 digits) -- confirm which is meant.
wbd_gdf = gpd.read_file(wbd_path)
huc_gdf = wbd_gdf[wbd_gdf['huc10'] == '1809010202']
gv.tile_sources.EsriImagery() * gv.Polygons(huc_gdf).opts(
    fill_alpha=0, line_color='blue', title='HUC 180901020204',
    height=600, width=600
)
C:\Users\Pete\earth-analytics\data\earthpy-downloads\WBD_18_HU2_Shape\Shape\WBDHU10.shp already exists
Out[7]:

Site Description¶

HUC 1809010202 is a HUC 10 watershed located in the Eastern Sierra region of California. This watershed is on the east side of the Sierra Crest, a significant geographic boundary in the western United States. Multiple major landforms and features exist within this watershed, including large, high-elevation mountains, alpine forests, high desert, alpine lakes, the town of Mammoth Lakes, CA, and the Mammoth Mountain ski area.

In [8]:
# Search multispectral data from Earthdata

# Log in to NASA Earthdata; persist=True stores credentials for later sessions
earthaccess.login(persist=True)

# Query granules with short name "HLSL30" that intersect the watershed's
# bounding box during the 2023 snow-free season
results = earthaccess.search_data(
    short_name="HLSL30",
    cloud_hosted=True,
    bounding_box=tuple(huc_gdf.total_bounds),
    temporal=("2023-05-01", "2023-09-30"),
)
Granules found: 18
In [9]:
# Open the Earthaccess results once; the decorator caches the file handles
@cached("open_results", False)
def open_results_and_cache(results):
    """Open the Earthdata search results, caching the opened file objects."""
    return earthaccess.open(results)

open_results = open_results_and_cache(results)
Opening 18 granules, approx size: 3.47 GB
QUEUEING TASKS | :   0%|          | 0/270 [00:00<?, ?it/s]
PROCESSING TASKS | :   0%|          | 0/270 [00:00<?, ?it/s]
COLLECTING RESULTS | :   0%|          | 0/270 [00:00<?, ?it/s]
In [ ]:
# Collect the remote URL of every opened granule file
url_results = [granule_file.url for granule_file in open_results]
In [ ]:
# Download all the tif files since I can't get rasterio to open them
def download_files(urls, directory):
    """Download each URL into directory, skipping files that already exist.

    Parameters
    ==========
    urls : list of str
      Remote file URLs to download.
    directory : str
      Existing local directory where files are written.
    """
    for url in urls:
        # Extract filename from URL
        filename = url.split('/')[-1]
        # Construct path to save file
        filepath = os.path.join(directory, filename)

        # Check if file already exists
        if os.path.exists(filepath):
            # BUG FIX: the original f-strings had no placeholders and
            # printed the literal text "(unknown)"; report the real name.
            print(f"{filename} already exists. Skipping download.")
            continue

        # Download the file; a timeout avoids hanging forever on a dead link
        print(f"Downloading {filename}...")
        response = requests.get(url, timeout=120)

        # Check if download was successful
        if response.status_code == 200:
            # Save the file
            with open(filepath, 'wb') as file:
                file.write(response.content)
            print(f"Downloaded {filename} successfully!")
        else:
            print(f"Failed to download {filename} "
                  f"(HTTP {response.status_code}).")
            
# Destination directory for the downloaded granule .tif files
dl_path = os.path.join(data_dir, 'earthpy-downloads')

# Fetch every granule file that is not already on disk
download_files(url_results, dl_path)
In [83]:
# Process the granules from Earthdata
def create_granule_df(earthdata_results, open_results):
    """
    Process the granules returned by an Earthdata search

    Parameters
    ==========
    earthdata_results : list
      A list object returned by earthaccess.search_data().
      This contains the result granules from the Earthdata search.

    open_results : opened earthdata_results object

    Returns
    =======
    granule_df : GeoDataFrame
      A dataframe with a row for each granule file.
      Each row will contain the granule geometry, the granule ID, the
      download URL, the tile ID, the datetime, and the band.
    """
    columns = ['geometry', 'granule_id', 'file_url', 'tile_id',
               'datetime', 'band_number']

    # Build a granule_id -> geometry lookup once, instead of re-scanning
    # every search result for every file (was O(files * granules)).
    geometry_by_id = {}
    for result in earthdata_results:
        result_attr = result['umm']
        extent = result_attr['SpatialExtent']
        geom_extent = extent['HorizontalSpatialDomain']['Geometry']['GPolygons']
        coord_list = []
        for item in geom_extent:
            for vertex in item['Boundary']['Points']:
                coord_list.append((vertex['Longitude'], vertex['Latitude']))
        geometry_by_id[result_attr['GranuleUR']] = (
            shapely.geometry.Polygon(coord_list))

    # Accumulate plain records and build the GeoDataFrame once at the end
    # (repeated pd.concat inside a loop is quadratic).
    records = []
    for url in open_results:
        file_url = str(url.url)
        # BUG FIX: parse the filename instead of hard-coded absolute URL
        # offsets, which silently break if the URL prefix length changes.
        # Filenames look like: HLS.L30.T11SLB.2023128T183322.v2.0.B01.tif
        parts = file_url.split('/')[-1].split('.')
        gran_id = '.'.join(parts[:6])
        tile_id = parts[2]
        datetime = parts[3][:7]
        # BUG FIX: the old [-7:-4] slice truncated 'Fmask' to 'ask' and
        # needed patching back; take the real band token instead.
        band = parts[-2]
        # BUG FIX: geometry_by_id.get() yields None for an unmatched granule
        # instead of raising NameError (or reusing the previous geometry).
        records.append({
            'geometry': geometry_by_id.get(gran_id),
            'granule_id': gran_id,
            'file_url': file_url,
            'tile_id': tile_id,
            'datetime': datetime,
            'band_number': band,
        })
    granule_df = gpd.GeoDataFrame(records, columns=columns)
    return granule_df
In [15]:
@cached("download_df", False)
def do_something(results, open_results):
    """Build the granule dataframe (or load it from the pickle cache)."""
    return create_granule_df(results, open_results)

download_df = do_something(results, open_results)
In [84]:
# Function to compute image mask
def process_tifs(download_df, huc_gdf):
    """
    Load and process the Fmask and B band .tif files
    in a granule.

    Parameters
    ==========
    download_df : GeoDataFrame
      Dataframe containing all the rows for each band .tif for all the files.

    huc_gdf : GeoDataFrame
      Geodataframe containing the geometry of the HUC watershed used to clip

    Returns
    =======
    accumulator_df : pandas.DataFrame
      Dataframe containing the processed rows for all the tifs.
      An xarray DataArray has been added for each processed .tif in the
      'processed_da' column.
    """
    grouped = download_df.groupby('granule_id')
    columns = ['geometry',
               'granule_id',
               'file_url',
               'tile_id',
               'datetime',
               'band_number',
               'processed_da']
    # Accumulate per-row frames and concatenate once at the end
    # (repeated pd.concat inside a loop is quadratic).
    row_frames = []
    # Looping through each Granule and processing data
    # Create cloud mask
    for granule_id, group_data in grouped:
        print("Processing data for Granule ID:", granule_id)
        # Locate this granule's Fmask row and open its raster
        fmask_row = group_data[group_data['band_number'] == "Fmask"].iloc[0]
        fmask_name = (fmask_row['granule_id'] + "."
                      + fmask_row['band_number'] + '.tif')
        # BUG FIX: build paths with os.path.join instead of hard-coded
        # Windows '\\' separators so the code also runs on macOS/Linux.
        fmask_path = os.path.join(data_dir, 'earthpy-downloads', fmask_name)
        fmask_da = rxr.open_rasterio(fmask_path).squeeze()
        # Reproject the watershed into the raster CRS and crop to its bounds
        huc_gdf_crs = huc_gdf.to_crs(fmask_da.rio.crs)
        fmask_crop_da = fmask_da.rio.clip_box(*huc_gdf_crs.total_bounds)
        mask = compute_mask(fmask_crop_da)
        # Apply crop, scale factor, and mask to all "B" files
        # (B10 and B11 are excluded here)
        for index, row in group_data.iterrows():
            if (row['band_number'].startswith('B')
                    and row['band_number'] not in ('B10', 'B11')):
                file_name = (row['granule_id'] + "."
                             + row['band_number'] + '.tif')
                file_path = os.path.join(
                    data_dir, 'earthpy-downloads', file_name)
                bband_da = rxr.open_rasterio(file_path).squeeze()
                # BUG FIX: the original reassigned the huc_gdf parameter here
                # but never used the result; the crop uses the bounds already
                # computed above in the raster's CRS.
                bband_crop_da = bband_da.rio.clip_box(*huc_gdf_crs.total_bounds)
                # Drop negative (fill) values before scaling
                bband_crop_da = bband_crop_da.where(bband_crop_da >= 0, np.nan)
                # Scale raw integer values to reflectance
                scale_factor = 0.0001
                bband_crop_da = bband_crop_da * scale_factor
                # Apply the cloud/shadow mask computed from the Fmask band
                processed_da = bband_crop_da.where(mask)
                row_frames.append(pd.DataFrame({
                    'geometry': row['geometry'],
                    'granule_id': row['granule_id'],
                    'file_url': row['file_url'],
                    'tile_id': row['tile_id'],
                    'datetime': row['datetime'],
                    'band_number': row['band_number'],
                    'processed_da': [processed_da]}))
    accumulator_df = (pd.concat(row_frames) if row_frames
                      else pd.DataFrame(columns=columns))
    return accumulator_df

def compute_mask(da, mask_bits=(1, 2, 3)):
    """
    Compute a keep/drop mask layer from the Fmask band.

    Parameters
    ==========
    da : array-like (e.g. rioxarray DataArray or numpy array)
      An array of the Fmask layer to use to compute the mask.

    mask_bits : sequence of int
      Bit positions to exclude; documentation is on page 21 of:
      https://hls.gsfc.nasa.gov/wp-content/uploads/2019/01/HLS.v1.4.UserGuide_draft_ver3.1.pdf
      (BUG FIX: the docstring previously called this parameter 'bits';
      the default is now an immutable tuple instead of a mutable list.)

    Returns
    =======
    mask : numpy.ndarray
      Array with the same shape as da: 1 where every selected bit is clear
      (pixel kept), 0 where any selected bit is set (pixel masked out).
    """
    # Unpack bits in the Fmask array.
    # bitorder='little' puts bit 0 first, so mask_bits index directly into
    # the unpacked axis.
    bits = np.unpackbits(
        da.astype(np.uint8), bitorder='little'
    ).reshape(da.shape + (-1,))

    # Keep a pixel only if all selected bits are 0. Convert mask_bits to a
    # list so numpy treats it as fancy indexing on the last axis (a tuple
    # would be interpreted as a multi-axis index).
    mask = np.prod(bits[..., list(mask_bits)] == 0, axis=-1)
    return mask
In [89]:
# Run methods to process all .tifs and create accumulator dataframe
@cached("accumulator_df", False)
def process_tifs_and_cache(download_df):
    """Process every band .tif for the watershed (or load the cached result)."""
    return process_tifs(download_df, huc_gdf)

accumulator_df = process_tifs_and_cache(download_df)
Processing data for Granule ID: HLS.L30.T11SLB.2023128T183322.v2.0
Processing data for Granule ID: HLS.L30.T11SLB.2023136T183258.v2.0
Processing data for Granule ID: HLS.L30.T11SLB.2023144T183311.v2.0
Processing data for Granule ID: HLS.L30.T11SLB.2023152T183301.v2.0
Processing data for Granule ID: HLS.L30.T11SLB.2023160T183307.v2.0
Processing data for Granule ID: HLS.L30.T11SLB.2023168T183311.v2.0
Processing data for Granule ID: HLS.L30.T11SLB.2023176T183301.v2.0
Processing data for Granule ID: HLS.L30.T11SLB.2023184T183321.v2.0
Processing data for Granule ID: HLS.L30.T11SLB.2023192T183318.v2.0
Processing data for Granule ID: HLS.L30.T11SLB.2023200T183324.v2.0
Processing data for Granule ID: HLS.L30.T11SLB.2023208T183320.v2.0
Processing data for Granule ID: HLS.L30.T11SLB.2023216T183334.v2.0
Processing data for Granule ID: HLS.L30.T11SLB.2023224T183329.v2.0
Processing data for Granule ID: HLS.L30.T11SLB.2023240T183338.v2.0
Processing data for Granule ID: HLS.L30.T11SLB.2023248T183340.v2.0
Processing data for Granule ID: HLS.L30.T11SLB.2023256T183347.v2.0
Processing data for Granule ID: HLS.L30.T11SLB.2023264T183347.v2.0
Processing data for Granule ID: HLS.L30.T11SLB.2023272T183344.v2.0
In [90]:
# Group the DataFrame by 'band_number' and 'datetime'
grouped_band_df = accumulator_df.groupby('band_number')

# Iterate through each band, building one cloud-free composite per band
band_accumulator = []
for band_number, band_data in grouped_band_df:
    # Iterate over each date
    date_accumulator = []
    for datetime, date_data in band_data.groupby('datetime'):
        # Merge the tiles captured on this date into one array
        merged_date_array = rxrm.merge_arrays(date_data['processed_da'].values)
        # Tag the merged array with its acquisition date
        # NOTE(review): the coord is named 'date' here but the concat below
        # uses dim='datetime' -- confirm the intended coordinate name.
        merged_date_array = merged_date_array.assign_coords(date=datetime)
        # Mask zero and negative values (<= 0, not just negative)
        merged_date_array = xr.where(merged_date_array <= 0, np.nan, merged_date_array)
        # Append to accumulator list
        date_accumulator.append(merged_date_array)
    
    # Concatenate the merged DataArrays along a new 'datetime' dimension
    date_accumulator = xr.concat(date_accumulator, dim='datetime')
    # Take the per-pixel mean over time, ignoring masked (NaN) pixels,
    # to create a composite image for this band
    composite_image = date_accumulator.mean(dim='datetime', skipna=True)
    
    # Add the 'band_number' as a new coordinate on the composite
    composite_image = composite_image.assign_coords(band_number=band_number)
    
    band_accumulator.append(composite_image)

# Concatenate along the 'band_number' dimension: one stacked array of
# per-band composites for the whole season
all_bands_array = xr.concat(band_accumulator, 'band_number')
In [103]:
# Organize data array of bands
all_bands_array.name = 'reflectance'
# Reshape to a (pixel, band) table: one row per (x, y) location, one column
# per selected band, dropping pixels missing any band value
model_df = (
    all_bands_array
    .sel(band_number=['B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B09'])
    .to_dataframe()
    .reflectance
    .unstack('band_number')
    .dropna()
)

# Fit KMeans model to my data.
# BUG FIX: set n_init explicitly to silence the sklearn FutureWarning about
# its changing default, and pin random_state so the clustering (and the map
# below) is reproducible across kernel restarts.
k_means = KMeans(n_clusters=6, n_init=10, random_state=42)
model_df['category'] = k_means.fit_predict(model_df)
C:\Users\Pete\miniconda3\envs\earth-analytics-python\lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
In [104]:
# Prepare data to display KMeans categories
model_array = model_df.category.to_xarray()
model_plot = (model_df
              .category
              .to_xarray()
              .sortby(['x', 'y'])
              .hvplot(x='x', y='y', colormap='Colorblind', aspect=1,
                      title='KMeans Categories')
             )

# Prepare the data to display as RGB
rgb = all_bands_array.sel(band_number=['B04', 'B03', 'B02'])
# BUG FIX: pass the dtype itself (np.uint8) to astype, not an np.uint8()
# instance; also fixed the 'rgb_unint8' typo.
rgb_uint8 = (rgb * 40).astype(np.uint8)
# Brighten for display.
# NOTE(review): uint8 arithmetic wraps on overflow, so *40 then *10 may wrap
# bright pixels (see the RuntimeWarning about invalid casts) -- confirm the
# brightening factors are intended.
rgb_brighten = rgb_uint8 * 10
rgb_plot = (
    rgb_brighten
    .hvplot
    .rgb(y='y', x='x', bands='band_number', aspect=1, colormap='RGB',
         title='RGB Aerial Image')
)

rgb_plot + model_plot
C:\Users\Pete\miniconda3\envs\earth-analytics-python\lib\site-packages\xarray\core\duck_array_ops.py:203: RuntimeWarning: invalid value encountered in cast
  return data.astype(dtype, **kwargs)
Out[104]:

Unsupervised Land Cover Classification for HUC 180901020204¶

Visual inspection of the RGB image suggests that this watershed is dominated by multiple land cover classes, including snowy mountainous areas, mountains, forests, high desert, open water, a variety of upland vegetated communities, and developed areas such as roads and buildings.

We ran K-Means classification using 6 classification categories. Inspecting the results of the classification alongside the RGB image shows that K-Means does a good job of classifying large, continuous land cover areas but has issues with areas that are less homogeneous. The large lakes on the east side of the watershed are being classified in the same category as upland areas that appear to have coniferous forest cover.

It appears that upland categories are also being distinguished, but it is difficult to assess the accuracy of these categories without on-the-ground validation of the communities that exist in the upland areas. As such, K-Means is a useful exploratory tool, but we cannot use it to confidently identify and classify areas of land cover with diverse vegetation types.

In [ ]: